This script reproduces the differential expression analysis presented in the paper Roux et al.Â
Due to patient privacy concerns patient metadata (used for Figure 1 and S1) and raw sequencing data cannot be shared publicly. The analysis code below starts from the read count matrix and associated sample and gene info files available from GEO (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE273902)
TO DO: add title and link to BioRXiv! TO DO: somehow the GO network plot is not reproduced perfectly, weird because I can reproduce it in my analysis script…
library(tidyverse)
## Gene annotation
library(ensembldb)
library(AnnotationHub)
library(org.Hs.eg.db)
library(RSQLite)
## PCA
library(SingleCellExperiment)
library(scater)
## Differential expression
library(limma)
library(edgeR)
## GSEA
library(GO.db)
library(igraph)
library(ggraph)
## TF activity
library(decoupleR)
## Ligands/receptors
library(CellChat)
## Differential abundance
library(xCell)
## Comparison to external datasets
library(GEOquery)
library(affy)
library(illuminaHumanv2.db)
library(illuminaHumanv3.db)
library(hgu133plus2.db)
## Plotting
library(ggplot2)
library(ggExtra)
library(ggrepel)
library(ggpubr)
myTheme <- theme_bw(base_size = 20) +
theme(panel.grid.major = element_line(colour = "lightgrey", linewidth = 0.2),
panel.grid.minor = element_line(colour = "lightgrey", linewidth = 0.05),
legend.position="right", legend.direction = "vertical",
legend.text=element_text(size=10),
axis.title=element_text(size=20),
axis.text=element_text(size=10),
panel.spacing.x = unit(1.2, "lines"))
myPalette <- c('#e6194b', '#4363d8', '#3cb44b', '#984EA3', '#f58231', '#ffe119', '#F781BF', '#808080', '#98BFDB', '#bcf60c', '#008080', '#e6beff', '#E5C494', '#000075', '#CD00CD', '#aaffc3', '#808000', '#9a6324', '#fffac8', '#800000', '#000000', '#ffffff')
# Useful for downloads
options(timeout=300)
## Save session info to show which versions of packages were used
library(devtools)
capture.output(session_info(), file="session_info.txt")
# rmarkdown::render("Roux_et_al_bariatric_surgery_paper.Rmd")
## Matrix of read counts
tab <- read.table(file = "data/GSE273902_Counts_Table.tsv.gz", sep = "\t", h=T)
## Info on samples
metadata <- read.table(file = "data/GSE273902_Sample_Info.tsv.gz", sep = "\t", h=T, quote = "")
metadata$Sample <- row.names(metadata)
## Info on genes
genes <- read.table(file = "data/GSE273902_Gene_Info.tsv.gz", sep = "\t", h=T, quote = "")
row.names(genes) <- genes$GENEID
## Build DGEList object
dge <- DGEList(counts = tab,
samples = metadata,
genes = genes)
## If additional information is needed for the genes, it can be retrieved with EnsemblDB, for example
ah <- AnnotationHub()
## snapshotDate(): 2023-10-23
ahDb <- query(ah, paste("Ensembl 102 EnsDb for Homo sapiens"))
edb <- ahDb[[1]]
## loading from cache
# dge$genes$SYMBOL <- mapIds(edb, key=dge$genes$GENEID, keytype="GENEID", column="SYMBOL", multiVals = "first")
## Calculate size factors
dge <- calcNormFactors(dge)
## Filter DGEList to keep paired samples (visits 1 and 4)
patient_to_keep <- names(which(rowSums(table(dge$samples$Patient, dge$samples$Visit)[, c("V1", "V4")] != 0) == 2))
sample_to_keep <- dge$samples$Visit %in% c("V1", "V4") & dge$samples$Patient %in% patient_to_keep
dge.subset <- dge[, sample_to_keep]
## Clean-up samples variables
dge.subset$samples$Patient <- factor(dge.subset$samples$Patient)
dge.subset$samples$Visit <- factor(dge.subset$samples$Visit)
table(dge.subset$samples$Visit)
##
## V1 V4
## 24 24
dge.subset$samples$Sex <- factor(dge.subset$samples$Sex)
table(dge.subset$samples$Sex)
##
## female male
## 28 20
dge.subset$samples$Patient.subgroup <- factor(dge.subset$samples$Patient.subgroup)
table(dge.subset$samples$Patient.subgroup)
##
## Insulin dependent T2D / No remission No diabetes T2D / No remission
## 8 16 8
## T2D / Remission Unknown
## 14 2
dge.subset$samples$Surgery.type <- factor(dge.subset$samples$Surgery.type)
## Simplify variable by pooling "gastric band" and "revision"
levels(dge.subset$samples$Surgery.type) <- c("other", "gastric bypass", "gastric sleeve", "other")
## Reorder
dge.subset$samples$Surgery.type <- factor(dge.subset$samples$Surgery.type, levels=levels(dge.subset$samples$Surgery.type)[c(2,3,1)])
table(dge.subset$samples$Surgery.type)
##
## gastric bypass gastric sleeve other
## 22 18 8
## Filter genes that are lowly expressed
keep <- rowSums(edgeR::cpm(dge.subset) > 1) >= 12 ## 1/4 of the samples
## Filter some gene biotypes, mostly pseudogenes
keep <- keep & (!dge.subset$genes$GENEBIOTYPE %in% c("pseudogene", "processed_pseudogene", "IG_C_pseudogene",
"IG_D_pseudogene", "IG_pseudogene", "IG_V_pseudogene", "IG_J_pseudogene",
"TR_J_pseudogene", "TR_V_pseudogene", "transcribed_processed_pseudogene",
"transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene",
"translated_processed_pseudogene", "translated_unprocessed_pseudogene",
"unitary_pseudogene", "rRNA_pseudogene",
"unprocessed_pseudogene", "polymorphic_pseudogene", "TEC") &
!is.na(dge.subset$genes$GENEBIOTYPE))
sum(keep)
## [1] 14358
## Subset, while updating total counts
dge.subset <- dge.subset[keep, , keep.lib.sizes=FALSE]
## Redo TMM normalization
dge.subset <- calcNormFactors(dge.subset)
## Calculate CPMs
log2cpm.loess <- normalizeBetweenArrays(edgeR::cpm(dge.subset, log=TRUE, prior.count=8, normalized.lib.sizes=TRUE), method = "cyclicloess")
plotDensities(log2cpm.loess, group=dge.subset$samples$groupSimple, col=myPalette, legend = FALSE)
Principal component analysis
## Perform PCA on the 500 genes with largest inter-quartile range
iqrs <- apply(log2cpm.loess, 1, IQR)
sel <- iqrs >= sort(iqrs, decreasing = T)[500]
pca1 <- prcomp(t(log2cpm.loess[sel,]), scale = T)
## Variance explained by top PCs
summary(pca1)$importance[, 1:10]
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## Standard deviation 13.55696 7.293266 6.131549 4.835588 4.270433 4.192822 3.90843 3.592951 3.390226 3.32974
## Proportion of Variance 0.36758 0.106380 0.075190 0.046770 0.036470 0.035160 0.03055 0.025820 0.022990 0.02217
## Cumulative Proportion 0.36758 0.473970 0.549160 0.595920 0.632400 0.667560 0.69811 0.723930 0.746910 0.76909
## Plotting
myDf <- cbind(pca1$x, dge.subset$samples)
f2p1 <- ggplot(myDf, aes(x=PC1, y=PC2, color=Visit, shape=Surgery.type)) +
geom_point(size=4) +
xlab(paste("PC1 (", round(summary(pca1)$importance[2,1],3)*100, "% variance)", sep="")) +
ylab(paste("PC2 (", round(summary(pca1)$importance[2,2],3)*100, "% variance)", sep="")) +
scale_color_manual(labels=c("Visit 1", "Visit 4"), values=myPalette[c(2,1)]) + ## v4 = red (up in DE analysis)
scale_shape_manual(values=c(15:17)) +
myTheme +
theme(legend.position="bottom")
## Figure 2A
f2p1 <- ggMarginal(f2p1, groupColour = TRUE, groupFill = TRUE)
plot(f2p1)
## Other interesting plots, not shown in the paper
## PC3 vs. 4 colored by sex
f <- ggplot(myDf, aes(x=PC3, y=PC4, color=Sex, shape=Visit)) +
geom_point(size=4) +
xlab(paste("PC3 (", round(summary(pca1)$importance[2,3],3)*100, "% variance)", sep="")) +
ylab(paste("PC4 (", round(summary(pca1)$importance[2,4],3)*100, "% variance)", sep="")) +
scale_color_manual(values=myPalette[3:4]) +
scale_shape_manual(values=15:16) +
myTheme +
theme(legend.position="bottom")
f <- ggMarginal(f, groupColour = TRUE, groupFill = TRUE)
plot(f)
f <- ggplot(myDf[myDf$Patient.subgroup != "Unknown", ],
aes(x=PC7, y=PC9, color=Patient.subgroup, shape=Visit)) +
geom_point(size=4) +
xlab(paste("PC7 (", round(summary(pca1)$importance[2,7],3)*100, "% variance)", sep="")) +
ylab(paste("PC9 (", round(summary(pca1)$importance[2,9],3)*100, "% variance)", sep="")) +
scale_color_manual(name="Patient subgroup",
values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))) +
scale_shape_manual(values=15:16) +
myTheme +
theme(legend.position="bottom") +
guides(color=guide_legend(ncol=2))
f <- ggMarginal(f, groupColour = TRUE, groupFill = TRUE)
plot(f)
## Percentage of variance of each PC explained by different factors
## Put normalized data into a SingleCellExperiment object to be used in function getExplanatoryPCs
sce <- SingleCellExperiment(assays=list(logcounts=log2cpm.loess),
reducedDims = list(PCA=pca1$x),
colData=dge.subset$samples)
percentVar <- getExplanatoryPCs(sce,
n_dimred = 48,
dimred = "PCA",
variables = c("Visit", "Patient", "Sex",
"Patient.subgroup", "Surgery.type"))
head(percentVar, n=10)
## Visit Patient Sex Patient.subgroup Surgery.type
## PC1 23.3007512 39.25754 1.2461776 6.597526 9.2467516
## PC2 0.3629822 65.29105 19.6800008 9.433735 8.3294502
## PC3 0.9991333 82.42652 23.2322872 29.273784 6.1749170
## PC4 4.4526713 82.20814 33.3452480 9.816160 1.7559173
## PC5 0.1148688 91.30424 9.0065640 3.770498 10.1595481
## PC6 0.3382346 85.61496 0.8068154 12.051550 2.1040895
## PC7 4.2925295 76.05769 5.3417397 17.572582 0.5431932
## PC8 0.6013340 78.83051 0.2964702 10.028197 9.0705173
## PC9 0.1353582 91.01690 1.3315236 19.424360 9.0489063
## PC10 0.1108903 95.52493 1.5439451 15.080473 1.8282309
## Plotting
myDf <- data.frame(PC = rep(seq_len(10), ncol(percentVar)),
Factor = rep(factor(colnames(percentVar), levels = c("Patient", "Visit", "Sex", "Surgery.type", "Patient.subgroup")), each = 10),
Pct_var_explained = as.numeric(percentVar[1:10, ]),
check.names = FALSE)
f2p2 <- ggplot(myDf, aes(x = PC, y = Pct_var_explained, fill = Factor)) +
geom_bar(stat="identity", width = 0.7, position=position_dodge2(padding=0.05)) +
scale_fill_manual(values = myPalette[c(10, 1, 3, 13, 7)]) +
xlab("Principal Component") +
ylab("% variance explained") +
coord_cartesian(ylim = c(0, 100), expand = FALSE) +
scale_x_continuous(breaks = c(1:10)) +
myTheme +
theme(panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position="bottom") +
guides(fill=guide_legend(ncol=3))
## Figure 2B
f2p2
## design matrix
moma <- model.matrix(~ 0 + Visit + Patient, data=dge.subset$samples)
colnames(moma) <- gsub("Visit", "", colnames(moma))
## Contrast matrix
contrasts.matrix <- makeContrasts(
Post_vs_pre = V4 - V1,
levels=moma
)
v <- voom(dge.subset, moma, plot=TRUE, normalize.method = "cyclicloess")
fit <- lmFit(v, moma)
fit2 <- contrasts.fit(fit, contrasts.matrix)
fit2 <- eBayes(fit2, robust=TRUE) ## avoids shrinkage of large variances
## Genes DE at FDR 5%
table(de <- decideTests(fit2, p.value = 0.05))
##
## -1 0 1
## 884 13011 463
## Top 10 genes by significance
topTable(fit2, coef=1, p.value=0.05, n=10, sort.by = "P")
## GENEID SYMBOL DESCRIPTION GENEBIOTYPE ENTREZID UNIPROT LENGTH logFC
## ENSG00000173585 ENSG00000173585 CCR9 C-C motif chemokine receptor 9 protein_coding 10803 B2R734 2807 1.1444969
## ENSG00000239571 ENSG00000239571 IGKV2D-30 immunoglobulin kappa variable 2D-30 IG_V_gene NA <NA> 395 2.0648001
## ENSG00000180537 ENSG00000180537 RNF182 ring finger protein 182 protein_coding 221687 A0A024QZW5 4098 -1.7403616
## ENSG00000095932 ENSG00000095932 SMIM24 small integral membrane protein 24 protein_coding 284422 O75264 1481 -1.8153217
## ENSG00000132465 ENSG00000132465 JCHAIN joining chain of multimeric IgA and IgM protein_coding 3512 P01591 1546 1.9230039
## ENSG00000163554 ENSG00000163554 SPTA1 spectrin alpha, erythrocytic 1 protein_coding 6708 P02549 10108 -1.3947816
## ENSG00000259040 ENSG00000259040 BLOC1S5-TXNDC5 BLOC1S5-TXNDC5 readthrough (NMD candidate) protein_coding NA <NA> 3030 0.8814465
## ENSG00000239264 ENSG00000239264 TXNDC5 thioredoxin domain containing 5 protein_coding 81567 A0A024QZV0 4361 0.8878952
## ENSG00000241294 ENSG00000241294 IGKV2-24 immunoglobulin kappa variable 2-24 IG_V_gene NA <NA> 390 1.9632546
## ENSG00000048462 ENSG00000048462 TNFRSF17 TNF receptor superfamily member 17 protein_coding 608 Q02223 1015 1.6891936
## AveExpr t P.Value adj.P.Val B
## ENSG00000173585 0.6221627 7.137526 9.635113e-08 0.001383410 7.163198
## ENSG00000239571 1.4115440 6.691342 3.054725e-07 0.001831252 6.552504
## ENSG00000180537 -1.0279598 -6.554139 4.374899e-07 0.001831252 6.100028
## ENSG00000095932 1.4076978 -6.495659 5.101692e-07 0.001831252 6.183886
## ENSG00000132465 5.9339808 6.117238 1.390180e-06 0.003992041 5.341951
## ENSG00000163554 -0.3731375 -5.952851 2.157171e-06 0.005162110 4.489506
## ENSG00000259040 5.8591142 5.773396 3.493046e-06 0.006232159 4.480450
## ENSG00000239264 5.8351068 5.748790 3.732339e-06 0.006232159 4.418202
## ENSG00000241294 1.8923204 5.696797 4.293790e-06 0.006232159 4.214713
## ENSG00000048462 0.6441979 5.692261 4.346637e-06 0.006232159 3.857395
## Top 10 genes by logFC
topTable(fit2, coef=1, p.value=0.05, n=10, sort.by = "logFC")
## GENEID SYMBOL DESCRIPTION GENEBIOTYPE ENTREZID UNIPROT LENGTH logFC AveExpr
## ENSG00000239571 ENSG00000239571 IGKV2D-30 immunoglobulin kappa variable 2D-30 IG_V_gene NA <NA> 395 2.064800 1.4115440
## ENSG00000241294 ENSG00000241294 IGKV2-24 immunoglobulin kappa variable 2-24 IG_V_gene NA <NA> 390 1.963255 1.8923204
## ENSG00000275527 ENSG00000275527 AC100835.2 novel transcript lncRNA NA <NA> 479 -1.956837 0.3429435
## ENSG00000167992 ENSG00000167992 VWCE von Willebrand factor C and EGF domains protein_coding 220001 Q96DN2 4766 -1.925803 2.5032452
## ENSG00000132465 ENSG00000132465 JCHAIN joining chain of multimeric IgA and IgM protein_coding 3512 P01591 1546 1.923004 5.9339808
## ENSG00000095932 ENSG00000095932 SMIM24 small integral membrane protein 24 protein_coding 284422 O75264 1481 -1.815322 1.4076978
## ENSG00000243238 ENSG00000243238 IGKV2-30 immunoglobulin kappa variable 2-30 IG_V_gene NA <NA> 390 1.792634 3.0282818
## ENSG00000130300 ENSG00000130300 PLVAP plasmalemma vesicle associated protein protein_coding 83483 Q9BX97 2295 -1.769098 1.2712457
## ENSG00000243264 ENSG00000243264 IGKV2D-29 immunoglobulin kappa variable 2D-29 IG_V_gene NA <NA> 400 1.742866 0.1837183
## ENSG00000180537 ENSG00000180537 RNF182 ring finger protein 182 protein_coding 221687 A0A024QZW5 4098 -1.740362 -1.0279598
## t P.Value adj.P.Val B
## ENSG00000239571 6.691342 3.054725e-07 0.001831252 6.55250410
## ENSG00000241294 5.696797 4.293790e-06 0.006232159 4.21471264
## ENSG00000275527 -5.639226 5.015494e-06 0.006232159 3.91406109
## ENSG00000167992 -5.468556 7.957993e-06 0.007928524 3.70020259
## ENSG00000132465 6.117238 1.390180e-06 0.003992041 5.34195144
## ENSG00000095932 -6.495659 5.101692e-07 0.001831252 6.18388602
## ENSG00000243238 5.359157 1.070610e-05 0.008055343 3.42466559
## ENSG00000130300 -5.407964 9.378410e-06 0.007928524 3.48492055
## ENSG00000243264 3.949592 4.872994e-04 0.022485176 -0.06752678
## ENSG00000180537 -6.554139 4.374899e-07 0.001831252 6.10002830
## Diagnostic plots
plotSA(fit2)
top <- topTable(fit2, coef=1, p.value=1, n=Inf, sort.by = "none")
hist(top$P.Value, n=100)
## CellChatDB ##################################################################
CellChatDB <- CellChatDB.human
## Extract all ligands
cellchat_ligands <- CellChatDB$interaction |>
dplyr::filter(annotation == "Secreted Signaling") |>
dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(ligand == Symbol)) |>
dplyr::select(ligand) |>
distinct() |>
dplyr::filter(ligand != "") |>
unlist()
## Add complexes
CellChatDB.complex <- CellChatDB$complex |>
as_tibble(rownames = "Complex") |>
pivot_longer(!Complex, names_to = "Subunit", values_to = "Symbol") |>
dplyr::filter(Symbol != "") |>
distinct()
cellchat_ligands <- c(cellchat_ligands,
CellChatDB$interaction |>
dplyr::filter(annotation == "Secreted Signaling") |>
dplyr::left_join(y = CellChatDB.complex,
by = join_by(ligand == Complex),
relationship = "many-to-many") |>
dplyr::filter(!is.na(Symbol)) |>
dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(Symbol == Symbol)) |>
dplyr::select(Symbol) |>
dplyr::filter(Symbol != "") |>
distinct() |>
unlist()) |>
unique()
# Same for receptors
cellchat_receptors <- CellChatDB$interaction |>
dplyr::filter(annotation == "Secreted Signaling") |>
dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(receptor == Symbol)) |>
dplyr::select(receptor) |>
distinct() |>
dplyr::filter(receptor != "") |>
unlist()
## Add complexes
cellchat_receptors <- c(cellchat_receptors,
CellChatDB$interaction |>
dplyr::filter(annotation == "Secreted Signaling") |>
dplyr::left_join(y = CellChatDB.complex,
by = join_by(receptor == Complex),
relationship = "many-to-many") |>
dplyr::filter(!is.na(Symbol)) |>
dplyr::left_join(y = CellChatDB$geneInfo, by = join_by(Symbol == Symbol)) |>
dplyr::select(Symbol) |>
dplyr::filter(Symbol != "") |>
distinct() |>
unlist()) |>
unique()
## CellTalkDB ##################################################################
CellTalkDB <- readRDS(url("https://github.com/ZJUFanLab/CellTalkDB/raw/f33e0e6dd3c143193d593b5dad8c613280b94699/database/human_lr_pair.rds"))
## Let's convert protein IDs to gene IDs using EnsemblDB
celltalk_ligands <- mapIds(edb, key=unique(CellTalkDB$ligand_ensembl_protein_id), keytype="PROTEINID", column="GENEID")
## Warning: Unable to map 21 of 816 requested IDs.
celltalk_ligands <- celltalk_ligands[!is.na(celltalk_ligands)]
## Same for receptors
celltalk_receptors <- mapIds(edb, key=unique(CellTalkDB$receptor_ensembl_protein_id), keytype="PROTEINID", column="GENEID")
## Warning: Unable to map 14 of 781 requested IDs.
celltalk_receptors <- celltalk_receptors[!is.na(celltalk_receptors)]
## CellPhoneDB #################################################################
CellPhoneDB <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/interaction_input.csv')
complex_input <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/complex_input.csv', row.names = 1)
geneInfo <- read.csv(file = 'https://raw.githubusercontent.com/ventolab/cellphonedb-data/753d63da75dc11730edc0c4b01fbc9100584f77d/data/gene_input.csv')
## process similar to CellChatDB
cellphone_ligands <- CellPhoneDB |>
dplyr::filter(directionality == "Ligand-Receptor") |>
dplyr::left_join(y = geneInfo,
by = join_by(partner_a == uniprot),
relationship = "many-to-many") |>
dplyr::select(ensembl) |>
distinct() |>
dplyr::filter(ensembl != "") |>
unlist()
## Add complexes
cellphoneDB.complex <- complex_input |>
as_tibble(rownames = "Complex") |>
pivot_longer(cols = starts_with("uniprot_"), names_to = "Subunit", values_to = "uniprot") |>
dplyr::filter(uniprot != "") |>
distinct() |>
dplyr::select(Complex, uniprot)
cellphone_ligands <- c(cellphone_ligands,
CellPhoneDB |>
dplyr::filter(directionality == "Ligand-Receptor") |>
dplyr::left_join(y = cellphoneDB.complex,
by = join_by(partner_a == Complex),
relationship = "many-to-many") |>
dplyr::filter(!is.na(uniprot)) |>
dplyr::left_join(y = geneInfo,
by = join_by(uniprot == uniprot),
relationship = "many-to-many") |>
dplyr::select(ensembl) |>
dplyr::filter(ensembl != "") |>
distinct() |>
unlist()) |>
unique()
## receptors
cellphone_receptors <- CellPhoneDB |>
dplyr::filter(directionality == "Ligand-Receptor") |>
dplyr::left_join(y = geneInfo,
by = join_by(partner_b == uniprot),
relationship = "many-to-many") |>
dplyr::select(ensembl) |>
distinct() |>
dplyr::filter(ensembl != "") |>
unlist()
## Add complexes
cellphone_receptors <- c(cellphone_receptors,
CellPhoneDB |>
dplyr::filter(directionality == "Ligand-Receptor") |>
dplyr::left_join(y = cellphoneDB.complex,
by = join_by(partner_b == Complex),
relationship = "many-to-many") |>
dplyr::filter(!is.na(uniprot)) |>
dplyr::left_join(y = geneInfo,
by = join_by(uniprot == uniprot),
relationship = "many-to-many") |>
dplyr::select(ensembl) |>
dplyr::filter(ensembl != "") |>
distinct() |>
unlist()) |>
unique()
## Merge 3 databases together, keeping only genes in dge.subset ################
all_ligands <- unique(c(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% cellchat_ligands],
dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% celltalk_ligands],
dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% cellphone_ligands]))
all_receptors <- unique(c(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% cellchat_receptors],
dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% celltalk_receptors],
dge.subset$genes$GENEID[dge.subset$genes$GENEID %in% cellphone_receptors]))
## When overlap between ligand and receptor list, keep genes only as ligand
all_receptors <- all_receptors[!all_receptors %in% all_ligands]
## Save in case some resources become unavailable
# saveRDS(all_ligands, "data/ligands_combined.rds")
# saveRDS(all_receptors, "data/receptors_combined.rds")
MA plots
# all_ligands <- readRDS("data/ligands_combined.rds")
# all_receptors <- readRDS("data/receptors_combined.rds")
myDf <- top
myDf$Significant <- FALSE
myDf$Significant[myDf$adj.P.Val <= 0.05 & myDf$logFC > 0] <- "Up"
myDf$Significant[myDf$adj.P.Val <= 0.05 & myDf$logFC < 0] <- "Down"
myDf$Significant <- factor(myDf$Significant, levels=c("Up", "Down", FALSE))
## Sort points so that significant points are plotted last
myDf <- myDf[order(myDf$Significant, decreasing = T), ]
f <- ggplot(myDf, aes(x=AveExpr, y=logFC)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab("Average expression") +
geom_point(aes(color=Significant, alpha=Significant), shape=16) +
scale_color_manual(name="Significant",
limits=c("Up", "Down"),
values=unname(c(myPalette[1], myPalette[2])),
na.value = "grey80") +
scale_alpha_manual(values=c(1,1,0.6), guide="none") +
myTheme +
theme(legend.position="bottom")
## Label ligands
myDf$ligand <- myDf$SYMBOL
myDf$ligand[myDf$adj.P.Val > 0.05 | !myDf$GENEID %in% all_ligands] <- ""
f3p1 <- f + geom_label_repel(data = myDf[myDf$ligand != "", ],
aes(x=AveExpr, y=logFC),
label=myDf[myDf$ligand != "", ]$ligand,
min.segment.length = unit(0, 'lines'), # always draw segment pointing to the gene
size=3,
color="black",
direction = "both",
point.padding = 0.1,
box.padding = 0.3,
max.overlaps = 25,
seed=1234)
f3p1
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps
## Label receptors
myDf$receptor <- myDf$SYMBOL
myDf$receptor[myDf$adj.P.Val > 0.05 | !myDf$GENEID %in% all_receptors] <- ""
fs2p1 <- f + geom_label_repel(data = myDf[myDf$receptor != "", ],
aes(x=AveExpr, y=logFC),
label=myDf[myDf$receptor != "", ]$receptor,
min.segment.length = unit(0, 'lines'),
size=3,
color="black",
direction = "both",
point.padding = 0.1,
box.padding = 0.3,
max.overlaps = 25,
seed=1234)
fs2p1
Boxplots showing normalized expression of top DE genes
top <- topTable(fit2, coef=1, p.value=0.01, n=Inf, sort.by = "P")
# Remove 2 genes because they have identical sequences (same reads are counted)
top <- top[!top$SYMBOL %in% c("BLOC1S5-TXNDC5", # same sequence as TXNDC5
"IGKV2-30"), ] # same sequence as IGKV2D-30
myDf <- log2cpm.loess[top$GENEID,] |>
as_tibble(rownames = NA) |>
rownames_to_column() |>
dplyr::rename(Gene = rowname) |>
pivot_longer(cols= colnames(log2cpm.loess),
names_to = "Sample",
values_to = "log2CPM") |>
dplyr::left_join(y=rownames_to_column(as_tibble(dge.subset$samples)), by = join_by("Sample")) |>
dplyr::left_join(y=as_tibble(dge.subset$genes), by=join_by("Gene" == "GENEID"))
## Reorder subgroups
myDf$Patient.subgroup <- factor(myDf$Patient.subgroup, levels = c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))
## Order of genes: significance + separate up (first) and down-regulated genes (then)
myDf$SYMBOL <- factor(myDf$SYMBOL, levels=c(top[top$logFC > 0, ]$SYMBOL, top[top$logFC < 0, ]$SYMBOL))
## Same plot colored by subgroup (see CCR9 with higher expression in no_diabetes)
fs2p2 <- ggplot(myDf, aes(x=Visit, y=log2CPM, group=Patient, col=Patient.subgroup)) +
facet_wrap( ~ SYMBOL, scales = "free_y", ncol = 5) +
geom_line(position = position_dodge(0.2), alpha = .5) +
geom_point(position = position_dodge(0.2), alpha = .8, size=2) +
scale_colour_manual(name="Patient subgroup", values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "unknown"))) +
ylab(bquote('log'[2]~'CPM')) +
xlab("") +
myTheme +
theme(strip.background = element_blank(),
strip.text.x = element_text(size = 14, vjust = 0, face = "bold"),
strip.clip = "off",
panel.spacing.x = unit(3, "points"),
panel.spacing.y = unit(0, "points"),
axis.text.y = element_text(size = 8),
legend.position="bottom")
fs2p2
Transcription factor activity inference
We obtained the TF to target map from the CollecTRI
collection. We used the decoupleR package version 2.9.7
(Bioconductor 3.18), the mapping with more recent versions might differ,
so below we read directly from a saved object.
# net <- get_collectri(organism='human', split_complexes=FALSE)
# saveRDS(net, file="data/CollecTRI.rds")
net <- readRDS(file="data/CollecTRI.rds")
# We infer TF activities from the t-values of the DEGs between v4 and v1
top <- topTable(fit2, n=Inf, p=1, sort.by = "none")[!duplicated(dge.subset$genes$SYMBOL), ]
row.names(top) <- top$SYMBOL
contrast_acts <- run_ulm(mat=top[, 't', drop=FALSE], net=net, minsize = 10)
## Perform FDR correction and keep significant TFs
f_contrast_acts <- contrast_acts |>
mutate(FDR = p.adjust(contrast_acts$p_value, method="BH")) |>
dplyr::filter(FDR < 0.1) |>
arrange(score)
f_contrast_acts
## # A tibble: 54 × 6
## statistic source condition score p_value FDR
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 ulm SP1 t -8.05 8.84e-16 3.72e-13
## 2 ulm GATA1 t -6.37 1.99e-10 4.18e- 8
## 3 ulm ESR2 t -4.77 1.84e- 6 2.58e- 4
## 4 ulm GATA2 t -4.37 1.23e- 5 1.04e- 3
## 5 ulm SMAD5 t -4.37 1.24e- 5 1.04e- 3
## 6 ulm ESR1 t -4.11 4.03e- 5 2.83e- 3
## 7 ulm EGR1 t -4.00 6.28e- 5 3.78e- 3
## 8 ulm RUNX1 t -3.93 8.54e- 5 4.49e- 3
## 9 ulm SP3 t -3.90 9.82e- 5 4.59e- 3
## 10 ulm HIF1A t -3.81 1.42e- 4 5.68e- 3
## # ℹ 44 more rows
## Plot for paper
f3p2 <- ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) +
geom_bar(aes(fill = score), stat = "identity") +
xlab("") +
scale_fill_gradient2(low = myPalette[2], high = myPalette[1], mid = "white", midpoint = 0) +
myTheme +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
f3p2
We download the pathways from the MSigDB version
2023.2.Hs database, available on the release archive
if(!file.exists("data/msigdb_v2023.2.Hs.db")){
download.file("https://data.broadinstitute.org/gsea-msigdb/msigdb/release/2023.2.Hs/msigdb_v2023.2.Hs.db.zip", destfile = "data/msigdb_v2023.2.Hs.db.zip")
unzip("data/msigdb_v2023.2.Hs.db.zip", exdir = "data/")
}
## connect to db
con <- dbConnect(drv=SQLite(), dbname="data/msigdb_v2023.2.Hs.db")
temp <- dbGetQuery(conn=con, statement="SELECT gene_set.standard_name, gene_set.collection_name, gene_symbol.symbol
FROM gene_set
LEFT JOIN gene_set_gene_symbol
ON gene_set.id = gene_set_gene_symbol.gene_set_id
LEFT JOIN gene_symbol
ON gene_symbol.id = gene_set_gene_symbol.gene_symbol_id
WHERE gene_set.collection_name='C2:CP:REACTOME' OR gene_set.collection_name='C2:CP:WIKIPATHWAYS'")
table(temp$collection_name)
##
## C2:CP:REACTOME C2:CP:WIKIPATHWAYS
## 95226 33947
## Let's turn this into structured lists, one per collection
collections <- sort(unique(temp$collection_name))
for (f in collections) {
cat(f, "\n")
myList <- tapply(temp[temp$collection_name %in% f, ],
factor(temp$standard_name[temp$collection_name %in% f]),
function(x){
return(dge.subset$genes$GENEID[dge.subset$genes$SYMBOL %in% unique(x$symbol)])
})
myCollection <- paste0(make.names(f), ".CATEGORY.v2023.2.Hs.ensembl.hsa")
assign(myCollection, myList)
}
## C2:CP:REACTOME
## C2:CP:WIKIPATHWAYS
rm(temp)
dbDisconnect(con)
For the paper we obtained the GO mapping from the
org.Hs.eg.db annotation package version 3.18. The mapping
with other Bioconductor versions might differ, so below we read directly
from a saved object in an .rds file.
## Perform the query
# updatedGO <- AnnotationDbi::select(org.Hs.eg.db, keys=dge.subset$genes$GENEID, keytype = "ENSEMBL", columns=c("GOALL", "EVIDENCEALL", "ONTOLOGYALL"))
# updatedGO <- tapply(updatedGO$ENSEMBL, list(updatedGO$GOALL), unique)
# saveRDS(updatedGO, file="data/GO_mapping.rds")
updatedGO <- readRDS(file="data/GO_mapping.rds")
This performs a self-contained enrichment test. All results are
stored into a list resRoast
# Used below for calculation of average logFC
top <- topTable(fit2, p.value = 1, n=Inf, sort="none")
geneList <- setNames(top$logFC, top$GENEID)
resRoast <- list()
for (f in c(paste0(make.names(collections), ".CATEGORY.v2023.2.Hs.ensembl.hsa"), "updatedGO")) {
resRoast[[f]] <- list()
## process and filter gene sets
indx <- ids2indices(gene.sets=get(f), identifiers=rownames(fit2$genes))
indx <- indx[sapply(indx, length) >= 10]
## enrichment test
resRoast[[f]] <- fry(log2cpm.loess, indx, moma, contrast = contrasts.matrix, sort = "directional")
## Add average logFC for every gene set
resRoast[[f]]$aveLog2FC <- sapply(indx, function(x) mean(geneList[x], na.rm=T))[rownames(resRoast[[f]])]
}
# Add useful info for GO terms
resRoast$updatedGO <- cbind(resRoast$updatedGO, AnnotationDbi::select(GO.db, keys=row.names(resRoast$updatedGO), keytype = "GOID", columns=c("ONTOLOGY", "TERM", "DEFINITION")))
## 'select()' returned 1:1 mapping between keys and columns
# Results used in the paper:
resRoast$updatedGO |> dplyr::select(ONTOLOGY, TERM, DEFINITION, NGenes, Direction, aveLog2FC, FDR) |> head()
## ONTOLOGY TERM
## GO:0046716 BP muscle cell cellular homeostasis
## GO:0098632 MF cell-cell adhesion mediator activity
## GO:0086065 BP cell communication involved in cardiac conduction
## GO:0009110 BP vitamin biosynthetic process
## GO:0042440 BP pigment metabolic process
## GO:0031593 MF polyubiquitin modification-dependent protein binding
## DEFINITION
## GO:0046716 The cellular homeostatic process that preserves a muscle cell in a stable functional or structural state.
## GO:0098632 The binding by a cell-adhesion protein on the cell surface to an extracellular matrix component, to mediate adhesion of the cell to another cell.
## GO:0086065 Any process that mediates interactions between a cell and its surroundings that contributes to the process of cardiac conduction. Encompasses interactions such as signaling or attachment between one cell and another cell, between a cell and an extracellular matrix, or between a cell and any other aspect of its environment.
## GO:0009110 The chemical reactions and pathways resulting in the formation of a vitamin, one of a number of unrelated organic substances that occur in many foods in small amounts and that are necessary in trace amounts for the normal metabolic functioning of the body.
## GO:0042440 The chemical reactions and pathways involving pigment, any general or particular coloring matter in living organisms, e.g. melanin.
## GO:0031593 Binding to a protein upon poly-ubiquitination of the target protein.
## NGenes Direction aveLog2FC FDR
## GO:0046716 16 Down -0.11044773 0.03155728
## GO:0098632 26 Down -0.17308639 0.03155728
## GO:0086065 22 Down -0.18108434 0.03155728
## GO:0009110 17 Down -0.08775647 0.03155728
## GO:0042440 53 Down -0.15012531 0.03155728
## GO:0031593 54 Down -0.09560465 0.03181636
resRoast$C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa |> dplyr::select(NGenes, Direction, aveLog2FC, FDR) |> head()
## NGenes Direction aveLog2FC FDR
## REACTOME_RHO_GTPASES_ACTIVATE_KTN1 10 Down -0.1693490 0.01320083
## REACTOME_HEME_BIOSYNTHESIS 13 Down -0.3726414 0.01320083
## REACTOME_ADRENALINE_NORADRENALINE_INHIBITS_INSULIN_SECRETION 13 Down -0.1630842 0.01626186
## REACTOME_THROMBIN_SIGNALLING_THROUGH_PROTEINASE_ACTIVATED_RECEPTORS_PARS 21 Down -0.1851271 0.01626186
## REACTOME_THROMBOXANE_SIGNALLING_THROUGH_TP_RECEPTOR 16 Down -0.1838983 0.01626186
## REACTOME_ATF6_ATF6_ALPHA_ACTIVATES_CHAPERONE_GENES 10 Up 0.1649199 0.01626186
resRoast$C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa |> dplyr::select(NGenes, Direction, aveLog2FC, FDR) |> head()
## NGenes Direction aveLog2FC FDR
## WP_GLYCOLYSIS_IN_SENESCENCE 11 Down -0.17272946 0.03348947
## WP_NAD_METABOLISM_IN_ONCOGENE_INDUCED_SENESCENCE_AND_MITOCHONDRIAL_DYSFUNCTION_ASSOCIATED_SENESCENCE 18 Down -0.11546673 0.03348947
## WP_VITAMIN_B12_METABOLISM 27 Down -0.17558880 0.03348947
## WP_METABOLIC_REPROGRAMMING_IN_PANCREATIC_CANCER 39 Down -0.10933796 0.03348947
## WP_HEMATOPOIETIC_STEM_CELL_DIFFERENTIATION 39 Down -0.27127520 0.03348947
## WP_2Q13_COPY_NUMBER_VARIATION_SYNDROME 45 Down -0.06125964 0.03348947
Display the gene overlap between significant gene sets
## Function inspired from enrichplot:::emapplot.enrichResult
prepare_graph <- function(x, ## a roast/camera output: subset to the categories to plot
min_edge = 0.1, ## min fraction of overlap
pair_sim = NULL, # x@termsim from pairwise_termsim(), or a similarity matrix. row and columns should match row.names of x
NGenes = "NGenes", # column with set size info
colVar = "aveLog2FC", # column used for coloring
...){
if (!is.numeric(min_edge) | min_edge < 0 | min_edge > 1) {
stop('"min_edge" should be a number between 0 and 1.')
}
y <- as.data.frame(x)
y$ID <- row.names(y)
## Similarity matrix
w <- as.matrix(pair_sim[row.names(y), row.names(y)])
## Pivot
wd <- w |>
as_tibble(rownames = "category1") |>
pivot_longer(cols= colnames(w),
names_to = "category2",
values_to = "similarity") |>
distinct() |>
dplyr::filter(category1 != category2) |>
dplyr::filter(!is.na(similarity)) |>
as.data.frame()
## Make graph
g <- graph.data.frame(wd[, c("category1", "category2")], directed=FALSE)
E(g)$width <- wd[, "similarity"]
## Use similarity as the weight(length) of an edge
E(g)$weight <- wd[, "similarity"]
## Delete when overlap is low
g <- delete.edges(g, E(g)[wd[, "similarity"] < min_edge])
## Size of vertices
V(g)$size <- y[V(g)$name, NGenes]
## Value used for coloring of the vertices
V(g)$color <- y[V(g)$name, colVar]
return(g)
}
## Reactome results ############################################################
myResults <- resRoast$C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa
## Filter out gene sets that are very large, and keep the significant ones with largest effect size
myResults <- myResults[myResults$FDR < 0.05 &
myResults$NGenes < 100 &
abs(myResults$aveLog2FC) > 0.1, ]
dim(myResults)
## [1] 69 7
## Build similarity matrix (Jaccard coefficient)
geneSets <- C2.CP.REACTOME.CATEGORY.v2023.2.Hs.ensembl.hsa[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
for (j in (i + 1):n) {
w[i, j] <- length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
}
}
## process and simplify names
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- gsub("^REACTOME_", "", row.names(myResults))
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)
g <- prepare_graph(myResults,
min_edge = 0.1,
pair_sim = pair_sim,
NGenes = "NGenes",
colVar = "aveLog2FC",
layout = "nicely")
## Cut gene sets names that are too long
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")
set.seed(1234) ## because graph plotting function is not deterministic
f4 <- ggraph(g, layout="nicely") +
geom_edge_link(aes(edge_width = width),
alpha=1, colour = "darkgrey") +
geom_node_point(aes(size = size,
fill = color),
shape = 21, color="grey10") +
geom_node_text(aes(label=name),
size = 3, bg.color = "white", repel=TRUE) +
scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) +
scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) +
coord_equal() +
theme_void()
f4
## Wikipathways results ########################################################
myResults <- resRoast$C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa
myResults <- myResults[myResults$FDR < 0.05 &
myResults$NGenes < 100 &
abs(myResults$aveLog2FC) > 0.1, ]
dim(myResults)
## [1] 25 7
geneSets <- C2.CP.WIKIPATHWAYS.CATEGORY.v2023.2.Hs.ensembl.hsa[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
for (j in (i + 1):n) {
w[i, j] <- length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
}
}
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- gsub("^WP_", "", row.names(myResults))
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)
g <- prepare_graph(myResults,
min_edge = 0.1,
pair_sim = pair_sim,
NGenes = "NGenes",
colVar = "aveLog2FC",
layout = "nicely")
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")
set.seed(1234) ## because graph plotting function is not deterministic
fs4p1 <- ggraph(g, layout="nicely") +
geom_edge_link(aes(edge_width = width),
alpha=1, colour = "darkgrey") +
geom_node_point(aes(size = size,
fill = color),
shape = 21, color="grey10") +
geom_node_text(aes(label=name),
size = 3, bg.color = "white", repel=TRUE) +
scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) +
scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) +
coord_equal() +
theme_void()
fs4p1
## Gene Ontology results #######################################################
myResults <- resRoast$updatedGO
myResults <- myResults[myResults$FDR < 0.05 &
myResults$NGenes < 100 &
abs(myResults$aveLog2FC) > 0.1, ]
dim(myResults)
## [1] 129 11
geneSets <- updatedGO[row.names(myResults)]
n <- length(geneSets)
w <- matrix(NA, nrow = n, ncol = n)
colnames(w) <- rownames(w) <- names(geneSets)
for (i in seq_len(n - 1)) {
for (j in (i + 1):n) {
w[i, j] <- length(intersect(geneSets[[i]], geneSets[[j]]))/length(unique(c(geneSets[[i]], geneSets[[j]])))
}
}
pair_sim <- w[row.names(myResults), row.names(myResults)]
row.names(myResults) <- str_to_title(myResults$TERM)
colnames(pair_sim) <- rownames(pair_sim) <- row.names(myResults)
g <- prepare_graph(myResults,
min_edge = 0.1,
pair_sim = pair_sim,
NGenes = "NGenes",
colVar = "aveLog2FC",
layout = "nicely")
V(g)$name[nchar(V(g)$name) > 50] <- paste0(substr(V(g)$name[nchar(V(g)$name) > 50], 1, 47), "...")
set.seed(1234) ## because graph plotting function is not deterministic
fs4p2 <- ggraph(g, layout="nicely") +
geom_edge_link(aes(edge_width = width),
alpha=1, colour = "darkgrey") +
geom_node_point(aes(size = size,
fill = color),
shape = 21, color="grey10") +
geom_node_text(aes(label=name),
size = 3, bg.color = "white", repel=TRUE) +
scale_size_continuous(name = "Number of genes", limits = c(10,100), range=c(1,10)) +
scale_edge_width_continuous(name = "Jaccard overlap", limits = c(0.1,1), range = c(0.1, 2)) +
scale_fill_gradient2(name = "aveLog2FC", low = myPalette[2], high = myPalette[1]) +
coord_equal() +
theme_void()
fs4p2
Inference of cell type abundances with xCell, available
here: https://github.com/dviraran/xCell
# Normalizing to gene length is required. We use the total length of genomic regions used for counting by FeatureCounts to calculate RPKMs
exprMatrix <- log2cpm.loess - log2( dge.subset$genes$LENGTH / 1000 )
row.names(exprMatrix) <- dge.subset$genes$SYMBOL
## Remove duplicated symbols
exprMatrix <- exprMatrix[ !duplicated(row.names(exprMatrix)), ]
## Filtering of cell types:
# - we do not run on cell types which are not expected in blood
# - we excluded as a 2nd step the cell types which gave enrichment scores ~ 0 in almost all samples (Tgd cells, CD4+ naive T-cells and NK cells)
set.seed(1234)
xCellresults <- xCellAnalysis(exprMatrix, cell.types.use = c("B-cells", "naive B-cells", "Memory B-cells",
"Class-switched memory B-cells","Plasma cells",
"CD4+ T-cells", "CD4+ memory T-cells", "CD4+ Tcm", "CD4+ Tem",
"CD8+ T-cells", "CD8+ naive T-cells", "CD8+ Tcm", "CD8+ Tem",
"Th2 cells", "Th1 cells", "Tregs",
"NKT",
"Monocytes", "Macrophages", "Macrophages M1", "Macrophages M2",
"DC", "aDC", "cDC", "iDC", "pDC",
"Neutrophils",
"Erythrocytes", "Platelets", "Megakaryocytes",
"Basophils", "Eosinophils", "Mast cells"))
## Significance testing with limma
all(colnames(dge.subset) == colnames(xCellresults))
fit.ct <- lmFit(xCellresults, moma)
fit2.ct <- contrasts.fit(fit.ct, contrasts.matrix)
fit2.ct <- eBayes(fit2.ct, trend=TRUE, robust=TRUE)
## Only 3 cell types significant at FDR=5%. Use a more lenient FDR cutoff of 20%
table(de <- decideTests(fit2.ct, p.value = 0.2))
top <- topTable(fit2.ct, coef=1, p.value=0.2, n=10, sort.by = "P")
top
## Plot the significant cell types
myDf <- xCellresults[row.names(xCellresults) %in% row.names(top), ] |>
as_tibble(rownames = NA) |>
rownames_to_column() |>
dplyr::rename("Cell Type" = rowname) |>
pivot_longer(cols= colnames(xCellresults),
names_to = "Sample",
values_to = "Score") |>
dplyr::left_join(y=as_tibble(dge.subset$samples), by = join_by("Sample"))
## Reorder levels
myDf$Patient.subgroup <- factor(myDf$Patient.subgroup, levels = c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "Unknown"))
## Reorder of cell types according to direction and significance
myDf$`Cell Type` <- factor(myDf$`Cell Type`, levels=c(row.names(top)[top$logFC > 0], row.names(top)[top$logFC < 0]))
f5 <- ggplot(myDf, aes(x=Visit, y=Score, group=Patient, col=Patient.subgroup)) +
facet_wrap( ~ `Cell Type`, scales = "free_y", ncol = 5) +
geom_line(position = position_dodge(0.2), alpha = .5) +
geom_point(position = position_dodge(0.2), alpha = .8, size=2) +
scale_colour_manual(name="Patient subgroup", values=setNames(myPalette[c(9,5,6,7,8)], c("No diabetes", "Insulin dependent T2D / No remission", "T2D / No remission", "T2D / Remission", "unknown"))) +
scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, .05))) +
ylab(bquote('Enrichment score')) +
xlab("") +
myTheme +
theme(strip.background = element_blank(),
strip.text.x = element_text(size = 14, vjust = 0, face = "bold"),
strip.clip = "off",
panel.spacing.x = unit(3, "points"),
panel.spacing.y = unit(0, "points"),
axis.text.y = element_text(size = 8),
legend.position="bottom")
f5
https://bmcmedgenomics.biomedcentral.com/articles/10.1186/s12920-015-0141-x
## Download Additional file 2 from paper
if(!file.exists("data/Homuth.xlsx")){
download.file("https://static-content.springer.com/esm/art%3A10.1186%2Fs12920-015-0141-x/MediaObjects/12920_2015_141_MOESM2_ESM.xlsx", destfile = "data/Homuth.xlsx")
}
## Supplementary Table 34: Results of sensitivity analyses in KORA F4 "BASIC MODEL: Adjustment for age, sex, technical covariates, and blood cell counts"
tab <- readxl::read_excel("data/Homuth.xlsx", sheet = 36, skip = 1)
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `n total` -> `n total...7`
## • `Effect` -> `Effect...8`
## • `SE` -> `SE...9`
## • `p-value` -> `p-value...10`
## • `p-value (BH)` -> `p-value (BH)...11`
## • `n total` -> `n total...12`
## • `Effect` -> `Effect...13`
## • `SE` -> `SE...14`
## • `p-value` -> `p-value...15`
## • `p-value (BH)` -> `p-value (BH)...16`
## • `n total` -> `n total...17`
## • `Effect` -> `Effect...18`
## • `SE` -> `SE...19`
## • `p-value` -> `p-value...20`
## • `p-value (BH)` -> `p-value (BH)...21`
## Refresh illumina array annotation
tab$GENEID <- AnnotationDbi::mapIds(illuminaHumanv3.db, keys = tab$`...2`, keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Average effect size of probes matching to same Ensembl gene
tab <- tab[!is.na(tab$GENEID), ]
tab <- avereps(tab$`Effect...8`, tab$GENEID)
colnames(tab)[1] <- "effect_size"
## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="GENEID", by.y=0)
## Total least squares regression
v <- prcomp(myDf[, c("logFC", "effect_size")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
## Labeling specific categories of genes: we use the clusters of proteins (encoded by significantly down-regulated genes) from the STRINGdb analysis (Figure S3)
stringClusters <- read.table("data/string_MCL_clusters.tsv", h=T, sep="\t", quote="", comment.char = "")
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Neutrophils NFKB
## 13 12 12 15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
## Define colors to be used
myColorNeutro <- myPalette[3]
myColorMito <- myPalette[4]
myColorHemo <- myPalette[5]
myColorErythro <- myPalette[6]
myColorNfkb <- myPalette[7]
f6p1 <- ggplot(myDf, aes(x=logFC, y=effect_size)) +
ylab(bquote('Effect size')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = 2.12, label.y = 0.077, aes(label = after_stat(r.label)), size=7, hjust=1) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
## Fix limits
xlim(c(-2.12, 2.12)) +
ylim(c(-0.077, 0.077)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="Effect of BMI (Homuth et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-0.077, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f6p1
https://genesandnutrition.biomedcentral.com/articles/10.1186/s12263-021-00702-7
## Load DE analysis file obtained from the authors upon request
tab <- read.table("data/Kalafati.txt", sep="\t", h=T, row.names = 1)
## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="GENEID", by.y=0)
## One lncRNA behaves like an outlier, with log2FC of 18.4, let's filter it out
myDf <- myDf[myDf$GENEID != "ENSG00000250302", ]
## Total least squares regression
v <- prcomp(myDf[, c("logFC", "log2FoldChange")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Neutrophils NFKB
## 12 12 13 14
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f6p2 <- ggplot(myDf, aes(x=logFC, y=log2FoldChange)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = 2.12, label.y = 3, aes(label = after_stat(r.label)), size=7, hjust=1) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
## Fix limits
xlim(c(-2.12, 2.12)) +
ylim(c(-3, 3)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="Insulin-resistant vs. sensitive (Kalafati et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-3, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f6p2
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_density2d()`).
## Warning: Removed 1 row containing non-finite outside the scale range (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).
https://www.sciencedirect.com/science/article/pii/S2666379121003499
## Download Table S5F from paper
if(!file.exists("data/Wesolowska-Andersen.xlsx")){
download.file("https://ars.els-cdn.com/content/image/1-s2.0-S2666379121003499-mmc6.xlsx", destfile = "data/Wesolowska-Andersen.xlsx")
}
## Focus on archetype B
tab <- readxl::read_excel("data/Wesolowska-Andersen.xlsx", sheet = 6, skip = 2)
## Process the gene IDs
tab$GENEID <- gsub("\\.\\d+$", "", tab$Transcript)
## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by="GENEID")
## Total least squares regression
v <- prcomp(myDf[, c("logFC", "B: estimate")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Neutrophils NFKB
## 13 11 12 14
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f6p3 <- ggplot(myDf, aes(x=logFC, y=`B: estimate`)) +
ylab(bquote('Effect size')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = 2.12, label.y = 0.04, aes(label = after_stat(r.label)), size=7, hjust=1) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
## Fix limits
xlim(c(-2.12, 2.12)) +
ylim(c(-0.04, 0.04)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="T2D archetype B (Wesolowska-Andersen et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-0.04, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f6p3
https://link.springer.com/article/10.1007/s00125-023-05886-8
## Download Supp Table of paper
if(!file.exists("data/deKlerk.xlsx")){
download.file("https://static-content.springer.com/esm/art%3A10.1007%2Fs00125-023-05886-8/MediaObjects/125_2023_5886_MOESM2_ESM.xlsx", destfile = "data/deKlerk.xlsx")
}
## Genes DE in MOD T2D cluster
tab <- readxl::read_excel(path="data/deKlerk.xlsx", sheet = 6, range = "O2:T7931")
## Load our data
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
## Merge
myDf <- merge(top.x, tab, by.x="SYMBOL", by.y="Gene name")
## Total least squares regression
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Mitochondrial Neutrophils NFKB
## 7 3 6 7 9
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f6p4 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = 2.12, label.y = 1, aes(label = after_stat(r.label)), size=7, hjust=1) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
## Fix limits
xlim(c(-2.12, 2.12)) +
ylim(c(-1, 1)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="Mild obesity-related T2D (de Klerk et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-1, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f6p4
https://www.frontiersin.org/journals/endocrinology/articles/10.3389/fendo.2023.1049484/full
The dataset was downloaded from SRA (BioProject ID PRJNA861382) and reprocessed similar to our dataset (STAR + featureCounts).
dge.liu <- readRDS("data/Liu_DGEList.rds")
dge.liu$samples
## group lib.size norm.factors Donor sex
## SRR20647845 post 22207976 0.9380378 Patient_8 Female
## SRR20647846 post 21164241 0.8767257 Patient_7 Female
## SRR20647854 pre 22508471 0.8465755 Patient_1 Female
## SRR20647836 pre 18414447 1.0485288 Patient_9 Female
## SRR20647841 pre 21166742 1.0296887 Patient_4 Female
## SRR20647847 post 18265668 1.0861937 Patient_6 Female
## SRR20647849 post 19149312 1.0266193 Patient_4 Female
## SRR20647843 post 20721058 0.9899505 Patient_10 Female
## SRR20647835 pre 19499648 1.0387250 Patient_10 Female
## SRR20647838 pre 22499441 0.9389552 Patient_7 Female
## SRR20647842 pre 23128101 1.0125301 Patient_3 Male
## SRR20647844 post 17560932 1.0830938 Patient_9 Female
## SRR20647851 post 15150392 0.9724933 Patient_2 Male
## SRR20647850 post 18867523 0.9853841 Patient_3 Male
## SRR20647837 pre 20081873 0.8965124 Patient_8 Female
## SRR20647839 pre 22766645 1.0812238 Patient_6 Female
## SRR20647840 pre 22421969 1.0192733 Patient_5 Female
## SRR20647848 post 18032982 1.0404871 Patient_5 Female
## SRR20647852 post 16732593 1.0517515 Patient_1 Female
## SRR20647853 pre 19830022 1.0874513 Patient_2 Male
## Filter lowly expressed genes
keep <- rowSums(edgeR::cpm(dge.liu) > 1) >= 3
## Exclude LRG loci
keep <- keep & !dge.liu$genes$GENEBIOTYPE %in% c("LRG_gene")
sum(keep)
## [1] 18648
dge.liu <- dge.liu[keep, , keep.lib.sizes=FALSE]
dge.liu <- calcNormFactors(dge.liu)
## design matrix
moma.liu <- model.matrix(~ 0 + group + Donor, data=dge.liu$samples)
colnames(moma.liu) <- gsub("group", "", colnames(moma.liu))
## Contrast matrix
contrasts.matrix.liu <- makeContrasts(
Post_vs_pre = post - pre,
levels=moma.liu
)
v <- voom(dge.liu, moma.liu, plot=TRUE, normalize.method = "cyclicloess")
fit <- lmFit(v, moma.liu)
fit2.liu <- contrasts.fit(fit, contrasts.matrix.liu)
fit2.liu <- eBayes(fit2.liu, robust=TRUE)
table(de <- decideTests(fit2.liu, p.value = 0.05))
##
## -1 0 1
## 170 18187 291
topTable(fit2.liu, coef=1, p.value=0.05, n=10, sort.by = "P")
## source SYMBOL DESCRIPTION GENEBIOTYPE ENTREZID
## ENSG00000165997 ensdb_110 ARL5B ADP ribosylation factor like GTPase 5B protein_coding 221079
## ENSG00000040633 ensdb_110 PHF23 PHD finger protein 23 protein_coding 79142
## ENSG00000261026 ensdb_110 novel transcript, overlapping to EGR3 lncRNA <NA>
## ENSG00000130600 ensdb_110 H19 H19 imprinted maternally expressed transcript lncRNA 283120
## ENSG00000118503 ensdb_110 TNFAIP3 TNF alpha induced protein 3 protein_coding 7128
## ENSG00000019186 ensdb_110 CYP24A1 cytochrome P450 family 24 subfamily A member 1 protein_coding 1591
## ENSG00000279192 ensdb_110 PWAR5 Prader Willi/Angelman region RNA 5 TEC <NA>
## ENSG00000115009 ensdb_110 CCL20 C-C motif chemokine ligand 20 protein_coding 6364
## ENSG00000249138 ensdb_110 SLED1 proteoglycan 3, pro eosinophil major basic protein 2 pseudogene transcribed_processed_pseudogene 643036
## ENSG00000171940 ensdb_110 ZNF217 zinc finger protein 217 protein_coding 7764
## UNIPROT LENGTH GENEID METABIOTYPE logFC AveExpr t P.Value adj.P.Val B
## ENSG00000165997 B0YIW9 22209 ENSG00000165997 protein_coding 1.6671176 5.9175446 11.541252 4.712802e-08 0.0008788433 9.005843
## ENSG00000040633 Q9BUL5 4694 ENSG00000040633 protein_coding -0.6102185 4.5412871 -10.575801 1.283630e-07 0.0011968568 8.045979
## ENSG00000261026 <NA> 4997 ENSG00000261026 long_non_coding 3.8880188 -0.2657712 9.984821 2.460841e-07 0.0015296585 4.602253
## ENSG00000130600 <NA> 9387 ENSG00000130600 long_non_coding 5.3886969 -1.0689809 9.708224 4.621405e-07 0.0018464388 6.131134
## ENSG00000118503 Q5VXR0 16101 ENSG00000118503 protein_coding 2.4512651 9.2667332 9.645060 4.959199e-07 0.0018464388 6.771843
## ENSG00000019186 Q07973 20541 ENSG00000019186 protein_coding -4.1040213 -1.6357374 -9.226407 5.940923e-07 0.0018464388 3.924560
## ENSG00000279192 <NA> 3180 ENSG00000279192 <NA> 0.7557717 3.1271397 9.042741 7.416132e-07 0.0019756576 6.285701
## ENSG00000115009 A0A2R8Y806 11826 ENSG00000115009 protein_coding 4.0117225 1.1834510 8.716809 1.461069e-06 0.0027041740 4.929147
## ENSG00000249138 <NA> 751 ENSG00000249138 pseudogene 3.6836561 -0.4619774 8.484649 1.486413e-06 0.0027041740 3.654499
## ENSG00000171940 O75362 42837 ENSG00000171940 protein_coding -0.5928820 6.9140751 -8.462452 1.529141e-06 0.0027041740 5.638985
## Comparison of fold changes ##################################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.liu, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID")
## Total least squares regression
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a,
list(a = format(unname(int), digits = 2),
b = format(unname(slp), digits = 2))))
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Mitochondrial Neutrophils NFKB
## 11 4 34 12 15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f7p1 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = -3.65, label.y = 3.65, aes(label = after_stat(r.label)), size=7, hjust=0) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
annotate(geom="text", x=-3.65, y=3.65-0.53,
label=eq, parse = TRUE, # See above
size=7, hjust=0, vjust=0) +
## Fix limits so that range is equal on x and y axes
xlim(c(-3.65, 3.65)) +
ylim(c(-3.65, 3.65)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-3.65, y=0, label="1 month post vs. pre-surgery (Liu et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-3.65, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f7p1
https://academic.oup.com/jes/article/8/1/bvad159/7484625
## Raw .CEL files are available from GEO under accession GSE271700
if(!dir.exists("data/GSE271700_RAW/")){
download.file("https://ftp.ncbi.nlm.nih.gov/geo/series/GSE271nnn/GSE271700/suppl/GSE271700_RAW.tar", destfile = "data/GSE271700_RAW.tar")
untar("data/GSE271700_RAW.tar", exdir = "data/GSE271700_RAW/")
file.remove("data/GSE271700_RAW.tar")
}
## Read-in and MA normalization
eset <- justRMA(celfile.path="data/GSE271700_RAW/")
##
fData(eset)$GENEID <- AnnotationDbi::mapIds(hgu133plus2.db, keys = row.names(eset), keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
fData(eset)$SYMBOL <- AnnotationDbi::mapIds(hgu133plus2.db, keys = row.names(eset), keytype = "PROBEID", column = "SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Filter out probes without an Ensembl ID
eset <- eset[!is.na(fData(eset)$GENEID)]
## Average expression of probes matching to same Ensembl gene
eset <- ExpressionSet(assayData = avereps(eset, ID=fData(eset)$GENEID),
phenoData = phenoData(eset))
fData(eset)$GENEID <- row.names(eset)
fData(eset)$SYMBOL <- mapIds(edb, key=row.names(eset), keytype="GENEID", column="SYMBOL")
fData(eset)$GENENAME <- mapIds(edb, key=row.names(eset), keytype="GENEID", column="DESCRIPTION")
## Sample info
pData(eset)$visit <- factor(strsplit2(colnames(eset), split = "_")[, 2])
pData(eset)$Donor <- factor(strsplit2(colnames(eset), split = "_")[, 3])
## normalization
exprs(eset) <- normalizeBetweenArrays(exprs(eset), method = "cyclicloess")
plotDensities(exprs(eset), group = pData(eset)$visit, legend="right")
## DE analysis #################################################################
## Pre vs. after 2 months
eset.subset <- eset[, pData(eset)$visit %in% c("T0", "T1")]
## Remove unpaired donors
eset.subset <- eset.subset[, pData(eset.subset)$Donor %in% names(which(table(pData(eset.subset)$Donor) == 2))]
dim(eset.subset)
## Features Samples
## 20202 46
pData(eset.subset)$Donor <- factor(pData(eset.subset)$Donor)
pData(eset.subset)$visit <- factor(pData(eset.subset)$visit)
## design matrix: for now ignore subgroup
moma.rashid <- model.matrix(~ 0 + visit + Donor, data=pData(eset.subset))
colnames(moma.rashid) <- gsub("visit", "", colnames(moma.rashid))
## Contrast matrix
contrasts.matrix.rashid <- makeContrasts(
Post_vs_pre = T1 - T0,
levels=moma.rashid
)
fit <- lmFit(eset.subset, moma.rashid)
keep <- fit$Amean > 4
fit2.rashid.2months <- contrasts.fit(fit[keep, ], contrasts.matrix.rashid)
fit2.rashid.2months <- eBayes(fit2.rashid.2months, trend = TRUE, robust=TRUE)
table(de <- decideTests(fit2.rashid.2months, p.value = 0.05))
##
## 0
## 15519
# Top genes
topTable(fit2.rashid.2months, coef=1, p.value=1, n=10, sort.by = "P")
## GENEID SYMBOL GENENAME logFC
## ENSG00000206047 ENSG00000206047 DEFA1 defensin alpha 1 [Source:HGNC Symbol;Acc:HGNC:2761] -1.6568855
## ENSG00000121634 ENSG00000121634 GJA8 gap junction protein alpha 8 [Source:HGNC Symbol;Acc:HGNC:4281] -0.3013184
## ENSG00000152292 ENSG00000152292 SH2D6 SH2 domain containing 6 [Source:HGNC Symbol;Acc:HGNC:30439] -0.3743990
## ENSG00000126790 ENSG00000126790 L3HYPDH trans-L-3-hydroxyproline dehydratase [Source:HGNC Symbol;Acc:HGNC:20488] 0.4584110
## ENSG00000117569 ENSG00000117569 PTBP2 polypyrimidine tract binding protein 2 [Source:HGNC Symbol;Acc:HGNC:17662] 0.4918848
## ENSG00000166432 ENSG00000166432 ZMAT1 zinc finger matrin-type 1 [Source:HGNC Symbol;Acc:HGNC:29377] 0.5366304
## ENSG00000185619 ENSG00000185619 PCGF3 polycomb group ring finger 3 [Source:HGNC Symbol;Acc:HGNC:10066] 0.3483125
## ENSG00000206530 ENSG00000206530 CFAP44 cilia and flagella associated protein 44 [Source:HGNC Symbol;Acc:HGNC:25631] 0.3579104
## ENSG00000003509 ENSG00000003509 NDUFAF7 NADH:ubiquinone oxidoreductase complex assembly factor 7 [Source:HGNC Symbol;Acc:HGNC:28816] 0.3566390
## ENSG00000262223 ENSG00000262223 AC110285.1 novel transcript -0.3090207
## AveExpr t P.Value adj.P.Val B
## ENSG00000206047 8.097970 -5.524585 6.960388e-06 0.1080183 3.4342260
## ENSG00000121634 5.019012 -4.916501 3.556657e-05 0.1400654 2.0833849
## ENSG00000152292 4.864189 -4.721844 6.133142e-05 0.1400654 1.6290041
## ENSG00000126790 4.422412 4.634457 7.773531e-05 0.1400654 1.4307382
## ENSG00000117569 5.131354 4.626606 7.940756e-05 0.1400654 1.4129216
## ENSG00000166432 4.438956 4.488288 1.154925e-04 0.1400654 1.0990254
## ENSG00000185619 5.321044 4.459633 1.233731e-04 0.1400654 1.0430452
## ENSG00000206530 4.123562 4.462549 1.238193e-04 0.1400654 1.0406300
## ENSG00000003509 5.379055 4.413380 1.403105e-04 0.1400654 0.9352602
## ENSG00000262223 5.152049 -4.307382 1.860634e-04 0.1400654 0.6979276
## Same for pre vs. after 1 year ###############################################
eset.subset <- eset[, pData(eset)$visit %in% c("T0", "T2")]
## Remove unpaired donors
eset.subset <- eset.subset[, pData(eset.subset)$Donor %in% names(which(table(pData(eset.subset)$Donor) == 2))]
dim(eset.subset)
## Features Samples
## 20202 48
pData(eset.subset)$Donor <- factor(pData(eset.subset)$Donor)
pData(eset.subset)$visit <- factor(pData(eset.subset)$visit)
## design matrix: for now ignore subgroup
moma.rashid <- model.matrix(~ 0 + visit + Donor, data=pData(eset.subset))
colnames(moma.rashid) <- gsub("visit", "", colnames(moma.rashid))
## Contrast matrix
contrasts.matrix.rashid <- makeContrasts(
Post_vs_pre = T2 - T0,
levels=moma.rashid
)
fit <- lmFit(eset.subset, moma.rashid)
keep <- fit$Amean > 4
fit2.rashid.1year <- contrasts.fit(fit[keep, ], contrasts.matrix.rashid)
fit2.rashid.1year <- eBayes(fit2.rashid.1year, trend = TRUE, robust=TRUE)
table(de <- decideTests(fit2.rashid.1year, p.value = 0.05))
##
## 0
## 15551
## but many genes between are DE with an FDR between 5 and 10%
table(de <- decideTests(fit2.rashid.1year, p.value = 0.1))
##
## -1 0 1
## 1719 11814 2018
topTable(fit2.liu, coef=1, p.value=0.1, n=10, sort.by = "P")
## source SYMBOL DESCRIPTION GENEBIOTYPE ENTREZID
## ENSG00000165997 ensdb_110 ARL5B ADP ribosylation factor like GTPase 5B protein_coding 221079
## ENSG00000040633 ensdb_110 PHF23 PHD finger protein 23 protein_coding 79142
## ENSG00000261026 ensdb_110 novel transcript, overlapping to EGR3 lncRNA <NA>
## ENSG00000130600 ensdb_110 H19 H19 imprinted maternally expressed transcript lncRNA 283120
## ENSG00000118503 ensdb_110 TNFAIP3 TNF alpha induced protein 3 protein_coding 7128
## ENSG00000019186 ensdb_110 CYP24A1 cytochrome P450 family 24 subfamily A member 1 protein_coding 1591
## ENSG00000279192 ensdb_110 PWAR5 Prader Willi/Angelman region RNA 5 TEC <NA>
## ENSG00000115009 ensdb_110 CCL20 C-C motif chemokine ligand 20 protein_coding 6364
## ENSG00000249138 ensdb_110 SLED1 proteoglycan 3, pro eosinophil major basic protein 2 pseudogene transcribed_processed_pseudogene 643036
## ENSG00000171940 ensdb_110 ZNF217 zinc finger protein 217 protein_coding 7764
## UNIPROT LENGTH GENEID METABIOTYPE logFC AveExpr t P.Value adj.P.Val B
## ENSG00000165997 B0YIW9 22209 ENSG00000165997 protein_coding 1.6671176 5.9175446 11.541252 4.712802e-08 0.0008788433 9.005843
## ENSG00000040633 Q9BUL5 4694 ENSG00000040633 protein_coding -0.6102185 4.5412871 -10.575801 1.283630e-07 0.0011968568 8.045979
## ENSG00000261026 <NA> 4997 ENSG00000261026 long_non_coding 3.8880188 -0.2657712 9.984821 2.460841e-07 0.0015296585 4.602253
## ENSG00000130600 <NA> 9387 ENSG00000130600 long_non_coding 5.3886969 -1.0689809 9.708224 4.621405e-07 0.0018464388 6.131134
## ENSG00000118503 Q5VXR0 16101 ENSG00000118503 protein_coding 2.4512651 9.2667332 9.645060 4.959199e-07 0.0018464388 6.771843
## ENSG00000019186 Q07973 20541 ENSG00000019186 protein_coding -4.1040213 -1.6357374 -9.226407 5.940923e-07 0.0018464388 3.924560
## ENSG00000279192 <NA> 3180 ENSG00000279192 <NA> 0.7557717 3.1271397 9.042741 7.416132e-07 0.0019756576 6.285701
## ENSG00000115009 A0A2R8Y806 11826 ENSG00000115009 protein_coding 4.0117225 1.1834510 8.716809 1.461069e-06 0.0027041740 4.929147
## ENSG00000249138 <NA> 751 ENSG00000249138 pseudogene 3.6836561 -0.4619774 8.484649 1.486413e-06 0.0027041740 3.654499
## ENSG00000171940 O75362 42837 ENSG00000171940 protein_coding -0.5928820 6.9140751 -8.462452 1.529141e-06 0.0027041740 5.638985
## Comparison of fold changes: 2 months ########################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.rashid.2months, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID")
## Total least squares regression
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a,
list(a = format(unname(int), digits = 2),
b = format(unname(slp), digits = 2))))
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Mitochondrial Neutrophils NFKB
## 12 12 8 10 15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f7p2 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
annotate(geom="text", x=-2.12, y=1.8,
label=eq, parse = TRUE, # See above
size=7, hjust=0, vjust=0) +
## Fix limits so that range is equal on x and y axes
xlim(c(-2.12, 2.12)) +
ylim(c(-2.12, 2.12)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="2 months post vs. pre-surgery (Rashid et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f7p2
## Comparison of fold changes: 1 year ########################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.rashid.1year, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID")
## Total least squares regression
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a,
list(a = format(unname(int), digits = 2),
b = format(unname(slp), digits = 2))))
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Mitochondrial Neutrophils NFKB
## 12 12 8 10 15
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f7p4 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
annotate(geom="text", x=-2.12, y=1.8,
label=eq, parse = TRUE, # See above
size=7, hjust=0, vjust=0) +
## Fix limits so that range is equal on x and y axes
xlim(c(-2.12, 2.12)) +
ylim(c(-2.12, 2.12)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="12 months post vs. pre-surgery (Rashid et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f7p4
https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0016729
# load series and platform data from GEO
gset <- getGEO("GSE19790", GSEMatrix =TRUE, AnnotGPL=TRUE)
## Found 1 file(s)
## GSE19790_series_matrix.txt.gz
gset <- gset[[1]]
## Update the probe annotation: Illumina human-6 v2.0 expression beadchip
fData(gset)$GENEID <- AnnotationDbi::mapIds(illuminaHumanv2.db, keys = row.names(gset), keytype = "PROBEID", column = "ENSEMBL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
fData(gset)$SYMBOL <- AnnotationDbi::mapIds(illuminaHumanv2.db, keys = row.names(gset), keytype = "PROBEID", column = "SYMBOL", multiVals = "first")
## 'select()' returned 1:many mapping between keys and columns
## Sample info
pData(gset)$group <- factor(gsub("Subject\\s\\d+\\s(pre|post)\\ssurgery" , "\\1", pData(gset)$title))
pData(gset)$Donor <- factor(gsub("Subject\\s(\\d+)\\s(pre|post)\\ssurgery" , "\\1", pData(gset)$title))
plotDensities(exprs(gset), group = pData(gset)$Donor, legend="right")
plotDensities(log2(exprs(gset)), group = pData(gset)$Donor, legend="right")
## Data was quantile normalized already, we simply need to log transform
exprs(gset) <- log2(exprs(gset))
## Filter out probes without an Ensembl ID
gset <- gset[!is.na(fData(gset)$GENEID)]
## Average expression of probes matching to same Ensembl ID
gset <- ExpressionSet(assayData = avereps(gset, ID=fData(gset)$GENEID),
phenoData = phenoData(gset))
fData(gset)$GENEID <- row.names(gset)
fData(gset)$SYMBOL <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="SYMBOL")
fData(gset)$UNIPROT <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="UNIPROTID")
fData(gset)$GENENAME <- mapIds(edb, key=row.names(gset), keytype="GENEID", column="DESCRIPTION")
## DE analysis #################################################################
## design matrix
moma.berisha <- model.matrix(~ 0 + group + Donor, data=pData(gset))
colnames(moma.berisha) <- gsub("group", "", colnames(moma.berisha))
## Contrast matrix
contrasts.matrix.berisha <- makeContrasts(
Post_vs_pre = post - pre,
levels=moma.berisha
)
fit <- lmFit(gset, moma.berisha)
keep <- fit$Amean > 6.5 # arbitrary filter on expression level
fit2.berisha <- contrasts.fit(fit[keep, ], contrasts.matrix.berisha)
fit2.berisha <- eBayes(fit2.berisha, trend = TRUE, robust=TRUE)
table(de <- decideTests(fit2.berisha, p.value = 0.2)) ## relaxed FDR cutoff here!
##
## -1 0 1
## 7 12930 1
topTable(fit2.berisha)
## GENEID SYMBOL UNIPROT GENENAME
## ENSG00000148346 ENSG00000148346 LCN2 P80188.206 lipocalin 2 [Source:HGNC Symbol;Acc:HGNC:6526]
## ENSG00000124469 ENSG00000124469 CEACAM8 Q0Z7S6.114 CEA cell adhesion molecule 8 [Source:HGNC Symbol;Acc:HGNC:1820]
## ENSG00000164047 ENSG00000164047 CAMP A0A384NPR0.7 cathelicidin antimicrobial peptide [Source:HGNC Symbol;Acc:HGNC:1472]
## ENSG00000173391 ENSG00000173391 OLR1 A0A024RAU0.50 oxidized low density lipoprotein receptor 1 [Source:HGNC Symbol;Acc:HGNC:8133]
## ENSG00000132950 ENSG00000132950 ZMYM5 <NA> zinc finger MYM-type containing 5 [Source:HGNC Symbol;Acc:HGNC:13029]
## ENSG00000206047 ENSG00000206047 DEFA1 P59665.167 defensin alpha 1 [Source:HGNC Symbol;Acc:HGNC:2761]
## ENSG00000164821 ENSG00000164821 DEFA4 P12838.160 defensin alpha 4 [Source:HGNC Symbol;Acc:HGNC:2763]
## ENSG00000169397 ENSG00000169397 RNASE3 P12724.191 ribonuclease A family member 3 [Source:HGNC Symbol;Acc:HGNC:10046]
## ENSG00000223609 ENSG00000223609 HBD E9PFT6.63 hemoglobin subunit delta [Source:HGNC Symbol;Acc:HGNC:4829]
## ENSG00000007944 ENSG00000007944 MYLIP Q8WY64.169 myosin regulatory light chain interacting protein [Source:HGNC Symbol;Acc:HGNC:21155]
## logFC AveExpr t P.Value adj.P.Val B
## ENSG00000148346 -0.9693081 8.914254 -7.411117 1.423487e-06 0.01841707 3.7564295
## ENSG00000124469 -0.9649413 9.087539 -6.975751 3.029591e-06 0.01959842 3.2898716
## ENSG00000164047 -0.8342927 10.282562 -6.550220 6.498199e-06 0.02802456 2.8018375
## ENSG00000173391 -0.7458480 7.697638 -6.054068 1.632259e-05 0.05279540 2.1914166
## ENSG00000132950 0.2515660 6.819214 5.748582 2.926057e-05 0.07571465 1.7930338
## ENSG00000206047 -1.0426848 11.519667 -5.604153 3.872615e-05 0.08350649 1.5987012
## ENSG00000164821 -0.6096806 7.585210 -5.522193 4.545724e-05 0.08401797 1.4867242
## ENSG00000169397 -0.3631733 7.325367 -5.106690 1.037699e-04 0.16782186 0.9004738
## ENSG00000223609 -0.9005301 9.539256 -4.883716 1.629694e-04 0.23427750 0.5734816
## ENSG00000007944 0.3466481 11.990352 4.733226 2.216880e-04 0.28681996 0.3481504
## Comparison of fold changes ##################################################
top.x <- topTable(fit2, coef="Post_vs_pre", sort.by = "none", n=Inf)
top.y <- topTable(fit2.berisha, coef="Post_vs_pre", sort.by = "none", n=Inf)
myDf <- merge(top.x, top.y, by="GENEID")
## Total least squares regression
v <- prcomp(myDf[, c("logFC.x", "logFC.y")])
slp <- v$rotation[2,1] / v$rotation[1,1]
int <- v$center[2] - slp*v$center[1]
eq <- as.expression(substitute(italic(y) == b %.% italic(x) + a,
list(a = format(unname(int), digits = 2),
b = format(unname(slp), digits = 2))))
## Labeling specific categories of genes
myDf$highlight <- NA
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 4]] <- "NFKB"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 7]] <- "Neutrophils"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 8]] <- "Erythrocytes"
myDf$highlight[myDf$SYMBOL.x %in% stringClusters$protein.name[stringClusters$cluster.number == 12]] <- "Hemoglobin"
## We add all other hemoglobin subunits
myDf$highlight[grep("^hemoglobin subunit", myDf$DESCRIPTION)] <- "Hemoglobin"
myDf$highlight[grep("^MT-", myDf$SYMBOL.x)] <- "Mitochondrial"
table(myDf$highlight)
##
## Erythrocytes Hemoglobin Neutrophils NFKB
## 13 9 11 15
## Sort data frame so that colored points are plotted last
myDf$highlight <- factor(myDf$highlight, levels = c("Neutrophils", "NFKB", "Erythrocytes", "Hemoglobin", "Mitochondrial"))
myDf <- myDf[order(myDf$highlight), ]
myDf <- rbind(myDf[is.na(myDf$highlight), ],
myDf[!is.na(myDf$highlight), ])
f7p3 <- ggplot(myDf, aes(x=logFC.x, y=logFC.y)) +
ylab(bquote('log'[2]~'Fold-Change')) +
xlab(bquote('log'[2]~'Fold-Change')) +
geom_point(aes(color=highlight, alpha=highlight, size=highlight), shape=16) +
geom_density_2d(col="grey30") +
## Correlation coefficient
stat_cor(label.x = -2.12, label.y = 2.12, aes(label = after_stat(r.label)), size=7, hjust=0) +
## Regression line slope and intercept
geom_abline(slope=slp, intercept=int, linewidth = 1, color="grey30") +
annotate(geom="text", x=-2.12, y=1.8,
label=eq, parse = TRUE, # See above
size=7, hjust=0, vjust=0) +
## Fix limits so that range is equal on x and y axes
xlim(c(-2.12, 2.12)) +
ylim(c(-2.12, 2.12)) +
scale_color_manual(name="",
limits=sort(unique(myDf$highlight)),
values=unname(c(myColorNeutro, myColorNfkb, myColorErythro, myColorHemo, myColorMito)),
na.value = "grey80") +
scale_size_manual(values=c(2,2,2,2,2), guide="none", na.value = 1) +
scale_alpha_manual(values=c(1,1,1,1,1), guide="none", na.value = 0.5) +
annotate(geom = "text", x=-2.12, y=0, label="6-12 months post vs. pre-surgery (Berisha et al.)", angle = 90, size=5, fontface = 'italic') +
annotate(geom = "text", x=0, y=-2.12, label="V4 vs. V1 (this study)", size=5, fontface = 'italic') +
myTheme
f7p3